{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Random Forest for Titanic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "np.random.seed(10)\n",
    "\n",
    "import pandas as pd\n",
    "import re\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.decomposition import PCA\n",
    "from decisiontree import *\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the Titanic training and testing CSVs into DataFrames.\n",
    "train_csv = \"datasets/titanic/titanic_training.csv\"\n",
    "test_csv = \"datasets/titanic/titanic_testing_data.csv\"\n",
    "\n",
    "train = pd.read_csv(train_csv)\n",
    "test = pd.read_csv(test_csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fill numeric gaps in both frames with the training-set column means.\n",
    "# numeric_only=True restricts the mean to numeric columns; without it,\n",
    "# pandas >= 2.0 raises TypeError as soon as the frame holds a non-numeric\n",
    "# column. The means are computed once, before imputing, so train and test\n",
    "# are filled with exactly the same values.\n",
    "train_means = train.mean(numeric_only=True)\n",
    "train.fillna(train_means, inplace=True)\n",
    "test.fillna(train_means, inplace=True)\n",
    "\n",
    "# For the remaining columns, use the most frequent training entry of each\n",
    "# column as its fill value (ties are broken arbitrarily by set order).\n",
    "mode_sex = list(train[\"sex\"])\n",
    "mode_ticket = list(train[\"ticket\"])\n",
    "mode_embarked = list(train[\"embarked\"])\n",
    "reps = {\"sex\": max(set(mode_sex), key=mode_sex.count),\n",
    "        \"ticket\": max(set(mode_ticket), key=mode_ticket.count),\n",
    "        \"embarked\": max(set(mode_embarked), key=mode_embarked.count)}\n",
    "\n",
    "# The same training-derived replacements are applied to both frames.\n",
    "train.fillna(reps, inplace=True)\n",
    "test.fillna(reps, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cabin handling for both frames: copy the cabin column, cut every value\n",
    "# down to its first character, and fill gaps with the most frequent entry\n",
    "# of that shortened column.\n",
    "# NOTE(review): `one_hot` is computed but never used. The shortened copy\n",
    "# is concatenated under the same name `cabin` and then removed again by\n",
    "# the duplicate-column filter, which keeps the first (original) column.\n",
    "# As written, this cell adds no cabin feature -- confirm whether the\n",
    "# dummies were meant to be joined instead.\n",
    "\n",
    "# training set\n",
    "cabin = train[[\"cabin\"]].copy()\n",
    "cabin[\"cabin\"] = cabin[\"cabin\"].str.slice(0, 1)\n",
    "mode_cabin = list(cabin[\"cabin\"])\n",
    "cabin[\"cabin\"] = cabin[\"cabin\"].fillna(\n",
    "    max(set(mode_cabin), key=mode_cabin.count))\n",
    "one_hot = pd.get_dummies(cabin[\"cabin\"])\n",
    "train = pd.concat([train, cabin], axis=1, sort=False)\n",
    "# keep only the first occurrence of every column name\n",
    "train = train.loc[:, ~train.columns.duplicated()]\n",
    "\n",
    "# test set\n",
    "cabin = test[[\"cabin\"]].copy()\n",
    "cabin[\"cabin\"] = cabin[\"cabin\"].str.slice(0, 1)\n",
    "mode_cabin = list(cabin[\"cabin\"])\n",
    "cabin[\"cabin\"] = cabin[\"cabin\"].fillna(\n",
    "    max(set(mode_cabin), key=mode_cabin.count))\n",
    "one_hot = pd.get_dummies(cabin[\"cabin\"])\n",
    "test = pd.concat([test, cabin], axis=1, sort=False)\n",
    "# The mask has to come from test's own columns: a mask built from\n",
    "# train.columns describes a different frame and does not identify the\n",
    "# duplicated column of test.\n",
    "test = test.loc[:, ~test.columns.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Swap the `sex` column for indicator columns (prefixed \"sex\"),\n",
    "# first in the training frame, then in the test frame.\n",
    "sex_dummies = pd.get_dummies(train[\"sex\"], prefix=\"sex\")\n",
    "train = pd.concat([train.drop(columns=[\"sex\"]), sex_dummies], axis=1)\n",
    "\n",
    "sex_dummies = pd.get_dummies(test[\"sex\"], prefix=\"sex\")\n",
    "test = pd.concat([test.drop(columns=[\"sex\"]), sex_dummies], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Swap the `embarked` column for indicator columns (prefixed \"embarked\"),\n",
    "# first in the training frame, then in the test frame.\n",
    "embarked_dummies = pd.get_dummies(train[\"embarked\"], prefix=\"embarked\")\n",
    "train = pd.concat([train.drop(columns=[\"embarked\"]), embarked_dummies], axis=1)\n",
    "\n",
    "embarked_dummies = pd.get_dummies(test[\"embarked\"], prefix=\"embarked\")\n",
    "test = pd.concat([test.drop(columns=[\"embarked\"]), embarked_dummies], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getnum(s):\n",
    "    \"\"\"Return the first run of digits in ``str(s)`` as an int, or -1 if there is none.\"\"\"\n",
    "    match = re.search(r\"\\d+\", str(s))\n",
    "    return int(match.group()) if match else -1\n",
    "\n",
    "# Parse every ticket into an integer; -1 marks tickets without any digits.\n",
    "train[\"ticket\"] = train[\"ticket\"].apply(getnum)\n",
    "test[\"ticket\"] = test[\"ticket\"].apply(getnum)\n",
    "\n",
    "# Replace the -1 sentinel with the frame's mean ticket number. The mean is\n",
    "# computed once per frame instead of once per sentinel row; it is taken over\n",
    "# the parsed column with the sentinels still included.\n",
    "# NOTE(review): the test frame is imputed with its own mean rather than the\n",
    "# training mean, and the -1 sentinels pull both means down -- confirm intended.\n",
    "train_ticket_mean = train[\"ticket\"].mean()\n",
    "test_ticket_mean = test[\"ticket\"].mean()\n",
    "train[\"ticket\"] = train[\"ticket\"].apply(lambda t: train_ticket_mean if t == -1 else t)\n",
    "test[\"ticket\"] = test[\"ticket\"].apply(lambda t: test_ticket_mean if t == -1 else t)\n",
    "\n",
    "# Separate the label column from the training features.\n",
    "train_y = train[\"survived\"].values\n",
    "train.drop(columns=[\"survived\"], inplace=True)\n",
    "\n",
    "# Remove the cabin column from both frames.\n",
    "train.drop(columns=[\"cabin\"], inplace=True)\n",
    "test.drop(columns=[\"cabin\"], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((800, 11), (310, 11))"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Turn both frames into plain arrays, then hold out 20% of the training\n",
    "# rows (test_size=0.2) as a validation split.\n",
    "x_test = test.values\n",
    "train = train.values\n",
    "x_train, x_val, y_train, y_val = train_test_split(\n",
    "    train, train_y, test_size=0.2\n",
    ")\n",
    "x_train.shape, test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2019-03-27 19:01:15,524\tWARNING worker.py:1406 -- WARNING: Not updating worker name since `setproctitle` is not installed. Install this with `pip install setproctitle` (or ray[debug]) to enable monitoring of worker processes.\n",
      "2019-03-27 19:01:15,527\tINFO node.py:423 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-03-27_19-01-15_61650/logs.\n",
      "2019-03-27 19:01:15,639\tINFO services.py:363 -- Waiting for redis server at 127.0.0.1:61363 to respond...\n",
      "2019-03-27 19:01:15,758\tINFO services.py:363 -- Waiting for redis server at 127.0.0.1:24281 to respond...\n",
      "2019-03-27 19:01:15,763\tINFO services.py:760 -- Starting Redis shard with 3.44 GB max memory.\n",
      "2019-03-27 19:01:15,778\tINFO services.py:1384 -- Starting the Plasma object store with 5.15 GB memory using /tmp.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 138 ms, sys: 81.5 ms, total: 219 ms\n",
      "Wall time: 3.87 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Construct the RandomForest from the local `decisiontree` module and fit\n",
    "# it on the training split. The cell magic above must stay on line one.\n",
    "rf = RandomForest(\n",
    "    rand_features=\"log\",\n",
    "    num_trees=20,\n",
    "    header=\"titanic_randomforest\",\n",
    ")\n",
    "rf.fit(x_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training accuracy: 0.62375\n",
      "validation accuracy: 0.57\n",
      "saved predictions\n",
      "CPU times: user 487 ms, sys: 17.9 ms, total: 505 ms\n",
      "Wall time: 512 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Accuracy is the fraction of predictions that equal the true labels.\n",
    "train_preds = rf.predict(x_train, \"train\")\n",
    "train_acc = np.mean(train_preds == y_train)\n",
    "print(\"training accuracy:\", train_acc)\n",
    "\n",
    "val_preds = rf.predict(x_val, \"val\")\n",
    "val_acc = np.mean(val_preds == y_val)\n",
    "print(\"validation accuracy:\", val_acc)\n",
    "\n",
    "# Run the fitted model on the test features as well.\n",
    "rf.predict(x_test, \"test\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
